knitr::opts_chunk$set(echo = TRUE)
# Librerias
library(dplyr)
library(tidyverse)
library(PASWR)
library(ggplot2)
library(TSstudio)
library(corrplot)
library(DAAG)
library(mvnormtest)
library(normtest)
library(MVN)
library(TSstudio)
Importacion de datos
datos_originales <- readRDS("chicago.rds")
Descripcion de los datos
summary(datos_originales)
## city tmpd dptp date
## Length:6940 Min. :-16.00 Min. :-25.62 Min. :1987-01-01
## Class :character 1st Qu.: 35.00 1st Qu.: 27.00 1st Qu.:1991-10-01
## Mode :character Median : 51.00 Median : 39.88 Median :1996-07-01
## Mean : 50.31 Mean : 40.34 Mean :1996-07-01
## 3rd Qu.: 67.00 3rd Qu.: 55.75 3rd Qu.:2001-04-01
## Max. : 92.00 Max. : 78.25 Max. :2005-12-31
## NA's :1 NA's :2
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## Min. : 1.70 Min. : 2.00 Min. : 0.1528 Min. : 6.158
## 1st Qu.: 9.70 1st Qu.: 21.50 1st Qu.:10.0729 1st Qu.:19.654
## Median :14.66 Median : 30.28 Median :18.5218 Median :24.556
## Mean :16.23 Mean : 33.90 Mean :19.4355 Mean :25.232
## 3rd Qu.:20.60 3rd Qu.: 42.00 3rd Qu.:27.0010 3rd Qu.:30.139
## Max. :61.50 Max. :365.00 Max. :66.5875 Max. :62.480
## NA's :4447 NA's :242
glimpse(datos_originales)
## Rows: 6,940
## Columns: 8
## $ city <chr> "chic", "chic", "chic", "chic", "chic", "chic", "chic", "ch~
## $ tmpd <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26.5, 32.5,~
## $ dptp <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26.750, 22.~
## $ date <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 1987-01-05~
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41.00000, 3~
## $ o3tmean2 <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5.833333,~
## $ no2tmean2 <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 25.77233,~
unique(datos_originales$city)
## [1] "chic"
- city: Ciudad de Chicago (no aporta valor: borrar)
- tmpd: temperatura en grados fahrenheit (desde los -16 a 92 F)
- dptp: total de muertes por dia
- date: Dias (desde el 1987-01-01 al 2005-12-31)
- pm25tmean2: cantidad promedio de particulas menores a los 2.5 microgramos por metro cubico (mas preligrosas, presenta muchos datos faltantes)
- pm10tmean2: cantidad promedio de particulas entre 2.5 y 10 microgramos por metro cubico (presenta algunos datos faltantes)
- o3tmean2: Promedio de Ozono medido en partes por millon
- no2tmean2: Dioxido de Nitrogeno promedio en partes por millon
Analisis exploratorio
Datos faltantes
# Grafico pm25tmean2 vs tiempo: NA = -50
grafico1 <- datos_originales %>%
mutate(na_cero_pm25 = ifelse(is.na(pm25tmean2), -50, pm25tmean2)) %>%
dplyr::select(date, na_cero_pm25)
ts_plot(grafico1)
cant_filas = 6940
cant_na_pm25tmean2 = 4447
cant_na_pm25tmean2/cant_filas #% NAs pm25tmean2
## [1] 0.6407781
cant_na_pm10tmean2 = 242
cant_na_pm10tmean2/cant_filas # % NAs pm10tmean2
## [1] 0.03487032
- pm25tmean2
- No hay datos de pm25tmean2 desde 1987 hasta 1998
- Varios de los datos a partir de 1998 tienen datos faltantes pm25tmean2
- 64% de la variable pm25tmean2 es faltante (recomendable eliminar la variable)
- pm10tmean2
- 3% de la variable pm10tmean2 es faltante (se puede filtrar)
Distribucion de variables
# tmpd
EDA(sample(datos_originales$tmpd , size = 5000))
## [1] "sample(datos_originales$tmpd, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 5000.000 0.000 -16.000 35.500 50.482 51.000 50.934 67.000
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 92.000 19.360 374.793 0.274 31.500 108.000 -0.773 -0.244
## SW p-val
## 0.000
EDA(sample((datos_originales$tmpd)^1.2 , size = 5000))
## [1] "sample((datos_originales$tmpd)^1.2, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 4985.000 15.000 0.000 72.489 112.669 111.965 113.166 155.342
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 227.272 49.494 2449.648 0.701 82.853 227.272 -0.992 -0.072
## SW p-val
## 0.000
data <- datos_originales
data_numeric <- dplyr::select(data, -date, -city)
library(psych)
multi.hist(x = data_numeric, dcol = c("blue", "red"), dlty = c("dotted", "solid"),
main = "")
- tmpd
- NO se presenta normalidad (según prueba de hipótesis al 95% de confianza), principalmente en las colas
- Sesgo en cola IZQUIERDA (se podria transformar elevando a 1.2)
- Sin valores extremos
- Aparentemente BIMODAL
# dptp
EDA(sample(datos_originales$dptp , size = 5000))
## [1] "sample(datos_originales$dptp, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 4998.000 2.000 -25.625 27.200 40.250 39.500 40.720 55.425
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 78.250 18.396 338.398 0.260 28.225 103.875 -0.539 -0.242
## SW p-val
## 0.000
EDA(sample((datos_originales$dptp)^1.2 , size = 5000))
## [1] "sample((datos_originales$dptp)^1.2, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 4914.000 86.000 0.000 53.125 88.270 84.593 88.202 124.928
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 187.146 43.739 1913.114 0.624 71.803 187.146 -1.017 0.085
## SW p-val
## 0.000
- dptp
- NO se presenta normalidad, principalmente en las colas
- Sesgo en cola IZQUIERDA (se podria transformar elevando a 1.2)
- Presenta valores extremos en cola IZQUIERDA
- Aparentemente BIMODAL
# pm25tmean2
EDA(sample(datos_originales$pm25tmean2, size = 5000))
## [1] "sample(datos_originales$pm25tmean2, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 1791.000 3209.000 1.700 9.775 16.197 14.762 15.543 20.529
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 61.500 8.717 75.979 0.206 10.754 59.800 1.569 1.140
## SW p-val
## 0.000
max_1 <- max(datos_originales$pm25tmean2, na.rm = TRUE)
max_2 <- max(filter(datos_originales, pm25tmean2 != max_1)$pm25tmean2, na.rm = TRUE)
EDA(sample(sqrt(filter(datos_originales, !(pm25tmean2 %in% c(max_1, max_2)))$pm25tmean2), size = 5000))
## [1] "sample(sqrt(filter(datos_originales, !(pm25tmean2 %in% c(max_1, "
## [2] " max_2)))$pm25tmean2), size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 1811.000 3189.000 1.304 3.130 3.899 3.834 3.866 4.561
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 7.211 1.036 1.073 0.024 1.431 5.907 -0.191 0.429
## SW p-val
## 0.000
- pm25tmean2
- NO se presenta normalidad
- Sesgo en cola DERECHA (se podria centrar con RAIZ CUADRADA)
- Presenta valores extremos en cola DERECHA (recomendable FILTRARLOS)
- UNIMODAL
# pm10tmean2
EDA(sample(datos_originales$pm10tmean2, size = 5000))
## [1] "sample(datos_originales$pm10tmean2, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 4827.000 173.000 2.000 21.500 33.720 30.000 32.310 42.000
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 365.000 18.169 330.127 0.262 20.500 363.000 26.265 2.581
## SW p-val
## 0.000
max_1 <- max(datos_originales$pm10tmean2, na.rm = TRUE)
max_2 <- max(filter(datos_originales, pm10tmean2 != max_1)$pm10tmean2, na.rm = TRUE)
EDA(sample(log(filter(datos_originales, !(pm10tmean2 %in% c(max_1, max_2)))$pm10tmean2), size = 5000))
## [1] "sample(log(filter(datos_originales, !(pm10tmean2 %in% c(max_1, "
## [2] " max_2)))$pm10tmean2), size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 4835.000 165.000 0.693 3.068 3.384 3.401 3.395 3.738
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 4.949 0.521 0.272 0.007 0.670 4.256 0.424 -0.345
## SW p-val
## 0.000
- pm10tmean2
- NO se presenta normalidad
- Sesgo en cola DERECHA (se podria centrar con LOGARITMO)
- Presenta valores extremos en cola DERECHA (recomendable FILTRARLOS)
- UNIMODAL
# o3tmean2
EDA(sample(datos_originales$o3tmean2 , size = 5000))
## [1] "sample(datos_originales$o3tmean2, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 5000.000 0.000 0.153 10.199 19.453 18.509 18.909 26.995
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 66.588 11.360 129.053 0.161 16.796 66.435 0.032 0.602
## SW p-val
## 0.000
EDA(sample(sqrt(datos_originales$o3tmean2) , size = 5000))
## [1] "sample(sqrt(datos_originales$o3tmean2), size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 5000.000 0.000 0.601 3.190 4.201 4.312 4.204 5.195
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 8.160 1.352 1.829 0.019 2.005 7.559 -0.572 -0.097
## SW p-val
## 0.000
- o3tmean2
- NO se presenta normalidad
- Sesgo en cola DERECHA (se podria centrar con RAIZ CUADRADA)
- Presenta valores extremos en cola DERECHA
- UNIMODAL
# no2tmean2
EDA(sample(datos_originales$no2tmean2, size = 5000))
## [1] "sample(datos_originales$no2tmean2, size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 5000.000 0.000 6.726 19.621 25.168 24.516 24.890 30.001
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 62.480 7.918 62.699 0.112 10.380 55.754 0.516 0.547
## SW p-val
## 0.000
EDA(sample(sqrt(datos_originales$no2tmean2), size = 5000))
## [1] "sample(sqrt(datos_originales$no2tmean2), size = 5000)"
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 5000.000 0.000 2.527 4.443 4.965 4.955 4.961 5.489
## Max. Stdev. Var. SE Mean I.Q.R. Range Kurtosis Skewness
## 7.904 0.800 0.640 0.011 1.046 5.377 0.039 0.069
## SW p-val
## 0.002
- no2tmean2
- NO se presenta normalidad
- Sesgo en cola DERECHA (se podria centrar con RAIZ CUADRADA)
- Presenta valores extremos en cola DERECHA
- UNIMODAL
Correlacion
correlaciones <- cor(datos_originales %>% dplyr::select(-c(city)) %>% mutate(date = as.numeric(date)), use = "complete.obs")
corrplot(correlaciones, method = "number")
* Pareciera haber relacion entre las variables temperatura, muertes por dia y ozono * Las variables de particulas en el aire aparece correlacionadas entre si y con nitrogeno
Limpieza de datos (data cleaning)
ts_plot(dplyr::select(data,tmpd,date),
title = "temperatura en grados fahrenheit (desde los -16 a 92 F)",
Xtitle = "Time",
Ytitle = "temperatura")
ts_plot(dplyr::select(data,pm10tmean2,date),
title = "cantidad promedio de particulas menores a los 2.5 microgramos por metro cubico (mas preligrosas)",
Xtitle = "particulas",
Ytitle = "temperatura")
ts_plot(dplyr::select(data,pm10tmean2,tmpd,date),
title = "cantidad promedio de particulas menores a los 2.5 microgramos por metro cubico vs temperatura en grados fahrenheit (desde los -16 a 92 F)",
Xtitle = "particulas",
Ytitle = "temperatura")
Prueba de normalidad multivariada.
result <- mvn(data = data_numeric, mvnTest = "mardia")
result$multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 3089.53870070102 0 NO
## 2 Mardia Kurtosis 18.4616360991651 0 NO
## 3 MVN <NA> <NA> NO
Los datos no siguen una distribución normal multivariada.
library(GGally)
ggpairs(data, lower = list(continuous = "smooth"),
diag = list(continuous = "barDiag"), axisLabels = "none")
library(psych)
multi.hist(x = data_numeric, dcol = c("blue", "red"), dlty = c("dotted", "solid"),
main = "")
# Pruebas estadísticas t-test.
t.test(data$pm10tmean2,data$tmpd) # where y1 and y2 are numeric
##
## Welch Two Sample t-test
##
## data: data$pm10tmean2 and data$tmpd
## t = -51.267, df = 13611, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -17.04171 -15.78656
## sample estimates:
## mean of x mean of y
## 33.89521 50.30934
t.test(data$pm10tmean2,mu=3)
##
## One Sample t-test
##
## data: data$pm10tmean2
## t = 140.73, df = 6697, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 3
## 95 percent confidence interval:
## 33.46484 34.32557
## sample estimates:
## mean of x
## 33.89521
#—————————————————
Construcción del modelo, seleccionando los predictores de forma aleatoria.
# Multiple Linear Regression Example
fit <- lm(cbind(pm10tmean2, tmpd) ~ dptp + date + pm25tmean2 +
pm10tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data=data)
summary(fit) # show results
## Response pm10tmean2 :
##
## Call:
## lm(formula = pm10tmean2 ~ dptp + date + pm25tmean2 + pm10tmean2 +
## pm10tmean2 + o3tmean2 + no2tmean2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.816e-14 -1.120e-15 -3.600e-16 3.700e-16 9.852e-13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.226e-14 6.446e-15 1.902e+00 0.0573 .
## dptp 2.849e-16 2.721e-17 1.047e+01 <2e-16 ***
## date 3.535e-20 5.100e-19 6.900e-02 0.9447
## pm25tmean2 1.867e-15 6.224e-17 3.000e+01 <2e-16 ***
## pm10tmean2 1.000e+00 3.955e-17 2.529e+16 <2e-16 ***
## o3tmean2 -4.500e-17 4.453e-17 -1.011e+00 0.3123
## no2tmean2 1.549e-17 7.267e-17 2.130e-01 0.8312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.984e-14 on 2481 degrees of freedom
## (4452 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.394e+32 on 6 and 2481 DF, p-value: < 2.2e-16
##
##
## Response tmpd :
##
## Call:
## lm(formula = tmpd ~ dptp + date + pm25tmean2 + pm10tmean2 + pm10tmean2 +
## o3tmean2 + no2tmean2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1566 -2.6391 -0.2384 2.5063 14.9304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.203e+00 1.234e+00 4.218 2.56e-05 ***
## dptp 9.197e-01 5.207e-03 176.651 < 2e-16 ***
## date 1.169e-04 9.759e-05 1.197 0.231
## pm25tmean2 -3.376e-01 1.191e-02 -28.345 < 2e-16 ***
## pm10tmean2 1.534e-01 7.568e-03 20.272 < 2e-16 ***
## o3tmean2 2.475e-01 8.522e-03 29.038 < 2e-16 ***
## no2tmean2 1.169e-01 1.391e-02 8.409 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.797 on 2481 degrees of freedom
## (4452 observations deleted due to missingness)
## Multiple R-squared: 0.9607, Adjusted R-squared: 0.9606
## F-statistic: 1.01e+04 on 6 and 2481 DF, p-value: < 2.2e-16
coefficients(fit)
## pm10tmean2 tmpd
## (Intercept) 1.226182e-14 5.2027509050
## dptp 2.849274e-16 0.9197470363
## date 3.535004e-20 0.0001168587
## pm25tmean2 1.867385e-15 -0.3376400888
## pm10tmean2 1.000000e+00 0.1534285870
## o3tmean2 -4.500241e-17 0.2474641875
## no2tmean2 1.548984e-17 0.1169446156
confint(fit, level=0.95)
## 2.5 % 97.5 %
## pm10tmean2:(Intercept) -3.784924e-16 2.490213e-14
## pm10tmean2:dptp 2.315775e-16 3.382772e-16
## pm10tmean2:date -9.646340e-19 1.035334e-18
## pm10tmean2:pm25tmean2 1.745330e-15 1.989440e-15
## pm10tmean2:pm10tmean2 1.000000e+00 1.000000e+00
## pm10tmean2:o3tmean2 -1.323246e-16 4.231976e-17
## pm10tmean2:no2tmean2 -1.270102e-16 1.579899e-16
## tmpd:(Intercept) 2.783743e+00 7.621759e+00
## tmpd:dptp 9.095373e-01 9.299567e-01
## tmpd:date -7.451072e-05 3.082282e-04
## tmpd:pm25tmean2 -3.609980e-01 -3.142822e-01
## tmpd:pm10tmean2 1.385875e-01 1.682697e-01
## tmpd:o3tmean2 2.307531e-01 2.641753e-01
## tmpd:no2tmean2 8.967403e-02 1.442152e-01
new_data <- data[1:1000,]
# compare models
fit1 <- lm(cbind(pm10tmean2, tmpd) ~ dptp + date +
pm10tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data=new_data)
fit2 <- lm(cbind(pm10tmean2, tmpd) ~ dptp + date +
pm10tmean2 + no2tmean2, data=new_data)
anova(fit1, fit2)
## Analysis of Variance Table
##
## Model 1: cbind(pm10tmean2, tmpd) ~ dptp + date + pm10tmean2 + pm10tmean2 +
## o3tmean2 + no2tmean2
## Model 2: cbind(pm10tmean2, tmpd) ~ dptp + date + pm10tmean2 + no2tmean2
## Res.Df Df Gen.var. Pillai approx F num Df den Df Pr(>F)
## 1 820 3.3966e-14
## 2 821 1 3.9757e-14 0.27187 152.9 2 819 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
El modelo con todas las variables introducidas como predictores tiene un R2 considerable (0.6181), es capaz de explicar el 61,81% de la variabilidad observada en cantidad promedio de particulas entre 2.5 y 10 microgramos por metro cubico. El p-value del modelo es significativo (2.2e-16) por lo que se puede aceptar que el modelo no es por azar.
Ahora podemos usar el comando “anova” para comparar el modelo con todas las variables con el modelo con solo las variables significativas para entender si los resultados son estadísticamente diferentes.
Explorar los datos univariados y bivariados
summary(data)
## city tmpd dptp date
## Length:6940 Min. :-16.00 Min. :-25.62 Min. :1987-01-01
## Class :character 1st Qu.: 35.00 1st Qu.: 27.00 1st Qu.:1991-10-01
## Mode :character Median : 51.00 Median : 39.88 Median :1996-07-01
## Mean : 50.31 Mean : 40.34 Mean :1996-07-01
## 3rd Qu.: 67.00 3rd Qu.: 55.75 3rd Qu.:2001-04-01
## Max. : 92.00 Max. : 78.25 Max. :2005-12-31
## NA's :1 NA's :2
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## Min. : 1.70 Min. : 2.00 Min. : 0.1528 Min. : 6.158
## 1st Qu.: 9.70 1st Qu.: 21.50 1st Qu.:10.0729 1st Qu.:19.654
## Median :14.66 Median : 30.28 Median :18.5218 Median :24.556
## Mean :16.23 Mean : 33.90 Mean :19.4355 Mean :25.232
## 3rd Qu.:20.60 3rd Qu.: 42.00 3rd Qu.:27.0010 3rd Qu.:30.139
## Max. :61.50 Max. :365.00 Max. :66.5875 Max. :62.480
## NA's :4447 NA's :242
pairs(data_numeric)
Creamos el vector de variables respuesta en función de las predictoras, y corremos el modelo de regresión multivariado.
mlm1 <- lm(cbind(tmpd, dptp) ~ pm25tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data = data)
summary(mlm1)
## Response tmpd :
##
## Call:
## lm(formula = tmpd ~ pm25tmean2 + pm10tmean2 + o3tmean2 + no2tmean2,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.677 -8.715 2.135 10.380 33.556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.89284 1.23669 27.406 <2e-16 ***
## pm25tmean2 -0.05942 0.04363 -1.362 0.173
## pm10tmean2 0.41926 0.02745 15.274 <2e-16 ***
## o3tmean2 0.88518 0.02843 31.139 <2e-16 ***
## no2tmean2 -0.50274 0.04981 -10.094 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.05 on 2483 degrees of freedom
## (4452 observations deleted due to missingness)
## Multiple R-squared: 0.4608, Adjusted R-squared: 0.4599
## F-statistic: 530.5 on 4 and 2483 DF, p-value: < 2.2e-16
##
##
## Response dptp :
##
## Call:
## lm(formula = dptp ~ pm25tmean2 + pm10tmean2 + o3tmean2 + no2tmean2,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.663 -9.691 1.642 10.834 37.900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.68183 1.29551 22.911 < 2e-16 ***
## pm25tmean2 0.30355 0.04570 6.642 3.79e-11 ***
## pm10tmean2 0.28936 0.02875 10.063 < 2e-16 ***
## o3tmean2 0.69273 0.02978 23.263 < 2e-16 ***
## no2tmean2 -0.67393 0.05218 -12.917 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.72 on 2483 degrees of freedom
## (4452 observations deleted due to missingness)
## Multiple R-squared: 0.3558, Adjusted R-squared: 0.3547
## F-statistic: 342.8 on 4 and 2483 DF, p-value: < 2.2e-16
Buscamos entender de forma separada las variables
m1 <- lm(tmpd ~ dptp + pm25tmean2 + pm10tmean2 + o3tmean2, data = data)
summary(m1)
##
## Call:
## lm(formula = tmpd ~ dptp + pm25tmean2 + pm10tmean2 + o3tmean2,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4361 -2.6675 -0.2679 2.4708 15.2113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.978628 0.239491 37.49 <2e-16 ***
## dptp 0.908187 0.005082 178.72 <2e-16 ***
## pm25tmean2 -0.313487 0.011679 -26.84 <2e-16 ***
## pm10tmean2 0.181416 0.006881 26.36 <2e-16 ***
## o3tmean2 0.232547 0.008377 27.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.85 on 2483 degrees of freedom
## (4452 observations deleted due to missingness)
## Multiple R-squared: 0.9595, Adjusted R-squared: 0.9594
## F-statistic: 1.471e+04 on 4 and 2483 DF, p-value: < 2.2e-16
m2 <- lm(pm25tmean2 ~ tmpd + dptp + pm10tmean2 + o3tmean2 + no2tmean2, data = data)
summary(m2)
##
## Call:
## lm(formula = pm25tmean2 ~ tmpd + dptp + pm10tmean2 + o3tmean2 +
## no2tmean2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.212 -3.295 -0.509 2.804 33.478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.29307 0.56314 9.399 < 2e-16 ***
## tmpd -0.72661 0.02554 -28.448 < 2e-16 ***
## dptp 0.71118 0.02418 29.415 < 2e-16 ***
## pm10tmean2 0.32182 0.01008 31.928 < 2e-16 ***
## o3tmean2 0.10114 0.01427 7.087 1.78e-12 ***
## no2tmean2 0.30490 0.01974 15.448 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.564 on 2482 degrees of freedom
## (4452 observations deleted due to missingness)
## Multiple R-squared: 0.5895, Adjusted R-squared: 0.5887
## F-statistic: 712.9 on 5 and 2482 DF, p-value: < 2.2e-16
lr.model.prob <- predict(m1, type = "response")
lr.model.prob[1:30]
## 4023 4029 4035 4041 4047 4053 4059 4065
## 52.17694 19.13518 23.55272 29.19331 36.25348 35.94015 43.07712 42.73895
## 4071 4077 4083 4089 4095 4101 4107 4110
## 41.90687 40.75831 36.07899 19.32836 46.27270 34.93974 69.82903 45.90900
## 4111 4112 4113 4114 4115 4116 4117 4118
## 45.26716 41.86561 36.36528 44.85610 48.64060 49.87723 42.78989 38.57987
## 4119 4123 4124 4125 4126 4127
## 46.20144 54.05372 47.74179 40.74245 44.87047 44.21020
# diagnostic plots
new_data <- data[1:100,]
fit1 <- lm(cbind(tmpd) ~ pm10tmean2 +dptp + date +
pm10tmean2 + pm10tmean2 + o3tmean2 + no2tmean2, data=new_data)
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
plot(fit1)